# Read the admissions data set (CSV in the working directory) and print
# per-column summary statistics.
admissionsData <- read.csv(file = "Admission_Predict_Ver1.1.csv")
summary(object = admissionsData)
## Serial.No. GRE.Score TOEFL.Score University.Rating
## Min. : 1.0 Min. :290.0 Min. : 92.0 Min. :1.000
## 1st Qu.:125.8 1st Qu.:308.0 1st Qu.:103.0 1st Qu.:2.000
## Median :250.5 Median :317.0 Median :107.0 Median :3.000
## Mean :250.5 Mean :316.5 Mean :107.2 Mean :3.114
## 3rd Qu.:375.2 3rd Qu.:325.0 3rd Qu.:112.0 3rd Qu.:4.000
## Max. :500.0 Max. :340.0 Max. :120.0 Max. :5.000
## SOP LOR CGPA Research
## Min. :1.000 Min. :1.000 Min. :6.800 Min. :0.00
## 1st Qu.:2.500 1st Qu.:3.000 1st Qu.:8.127 1st Qu.:0.00
## Median :3.500 Median :3.500 Median :8.560 Median :1.00
## Mean :3.374 Mean :3.484 Mean :8.576 Mean :0.56
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:9.040 3rd Qu.:1.00
## Max. :5.000 Max. :5.000 Max. :9.920 Max. :1.00
## Chance.of.Admit
## Min. :0.3400
## 1st Qu.:0.6300
## Median :0.7200
## Mean :0.7217
## 3rd Qu.:0.8200
## Max. :0.9700
# Preview the first six rows of the data frame.
head(x = admissionsData, n = 6L)
## Serial.No. GRE.Score TOEFL.Score University.Rating SOP LOR CGPA Research
## 1 1 337 118 4 4.5 4.5 9.65 1
## 2 2 324 107 4 4.0 4.5 8.87 1
## 3 3 316 104 3 3.0 3.5 8.00 1
## 4 4 322 110 3 3.5 2.5 8.67 1
## 5 5 314 103 2 2.0 3.0 8.21 0
## 6 6 330 115 5 4.5 3.0 9.34 1
## Chance.of.Admit
## 1 0.92
## 2 0.76
## 3 0.72
## 4 0.80
## 5 0.65
## 6 0.90
# Put the data frame's columns on the search path so they can be referenced
# by bare name.
attach(what = admissionsData)
# Linear regression and diagnostic plots.
# Full model: Chance.of.Admit regressed on every other column of the data frame.
linear <- lm(formula = Chance.of.Admit ~ ., data = admissionsData)
summary(object = linear)
##
## Call:
## lm(formula = Chance.of.Admit ~ ., data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.248847 -0.025984 0.006627 0.036671 0.150015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.3379983 0.1030617 -12.982 < 2e-16 ***
## Serial.No. 0.0000868 0.0000187 4.641 4.44e-06 ***
## GRE.Score 0.0019217 0.0004923 3.903 0.000108 ***
## TOEFL.Score 0.0031928 0.0008594 3.715 0.000227 ***
## University.Rating 0.0053164 0.0037273 1.426 0.154405
## SOP 0.0045661 0.0045161 1.011 0.312489
## LOR 0.0149151 0.0040757 3.660 0.000280 ***
## CGPA 0.1155561 0.0095282 12.128 < 2e-16 ***
## Research 0.0225254 0.0064834 3.474 0.000557 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05877 on 491 degrees of freedom
## Multiple R-squared: 0.8294, Adjusted R-squared: 0.8266
## F-statistic: 298.4 on 8 and 491 DF, p-value: < 2.2e-16
# Diagnostic plots for the model currently stored in `linear`.
plot(x = linear)
# Full model: University.Rating regressed on every other column.
linear <- lm(formula = University.Rating ~ ., data = admissionsData)
summary(object = linear)
##
## Call:
## lm(formula = University.Rating ~ ., data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.34889 -0.46404 -0.02909 0.43638 2.53513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.3520556 1.4229030 -3.761 0.000189 ***
## Serial.No. 0.0001131 0.0002308 0.490 0.624275
## GRE.Score 0.0050723 0.0060361 0.840 0.401135
## TOEFL.Score 0.0184033 0.0104963 1.753 0.080172 .
## SOP 0.4420126 0.0508516 8.692 < 2e-16 ***
## LOR 0.1376178 0.0495241 2.779 0.005665 **
## CGPA 0.2666732 0.1306889 2.041 0.041833 *
## Research 0.0744728 0.0792227 0.940 0.347657
## Chance.of.Admit 0.7761573 0.5441596 1.426 0.154405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7101 on 491 degrees of freedom
## Multiple R-squared: 0.6205, Adjusted R-squared: 0.6144
## F-statistic: 100.4 on 8 and 491 DF, p-value: < 2.2e-16
# Diagnostic plots for the model currently stored in `linear`.
plot(x = linear)
# Full model for Research on every other column.
# NOTE(review): no `family` is passed, so glm() uses its gaussian default and
# this is a linear fit of the 0/1 Research column, not a logistic regression,
# despite the name `logmod`.
logmod <- glm(formula = Research ~ ., data = admissionsData)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ ., data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.01724 -0.33223 0.00753 0.29143 0.99776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.738e+00 7.931e-01 -5.974 4.44e-09 ***
## Serial.No. 7.426e-05 1.313e-04 0.565 0.572113
## GRE.Score 1.921e-02 3.327e-03 5.776 1.36e-08 ***
## TOEFL.Score -8.741e-03 5.980e-03 -1.462 0.144417
## University.Rating 2.412e-02 2.566e-02 0.940 0.347657
## SOP 1.441e-02 3.108e-02 0.464 0.643147
## LOR 1.404e-02 2.840e-02 0.494 0.621210
## CGPA -9.398e-02 7.457e-02 -1.260 0.208213
## Chance.of.Admit 1.065e+00 3.066e-01 3.474 0.000557 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1633439)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 80.202 on 491 degrees of freedom
## AIC: 523.91
##
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the Research model stored in `logmod`.
plot(logmod)
# Disabled: scatter plot of Chance.of.Admit against CGPA with a fitted line.
# NOTE(review): if this is re-enabled, xlab and ylab look swapped (the formula
# puts CGPA on the x axis), and abline() has no `data` argument.
#chance.vs.CGPA <- lm(admissionsData$Chance.of.Admit ~ admissionsData$CGPA)
#plot(admissionsData$Chance.of.Admit ~ admissionsData$CGPA, xlab = "Chance of Admission", ylab = "CGPA", main = "Chance of Admission VS CGPA")
#abline(chance.vs.CGPA , col="red", lwd=3, data = admissionsData)
Using backward selection, we remove the least significant predictor (the one with the highest p-value) one at a time and refit the model, until every remaining predictor is significant.
# Backward selection for Chance.of.Admit.
# Start from the full model (every other column as a predictor).
linear <- lm(formula = Chance.of.Admit ~ ., data = admissionsData)
# summary(linear)
# Step 1: drop University.Rating (not significant in the full model).
# Serial.No. is also absent from the explicit formulas from this step onward.
linear <- lm(
  formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA +
    Research,
  data = admissionsData
)
# summary(linear)
# Step 2: drop SOP, the other term that was not significant.
linear <- lm(
  formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = admissionsData
)
# Every remaining predictor is now significant.
summary(object = linear)
##
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR +
## CGPA + Research, data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.265965 -0.023835 0.008003 0.035543 0.158379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.3357018 0.0990753 -13.482 < 2e-16 ***
## GRE.Score 0.0018892 0.0005024 3.760 0.000190 ***
## TOEFL.Score 0.0030174 0.0008619 3.501 0.000506 ***
## LOR 0.0193203 0.0037939 5.092 5.04e-07 ***
## CGPA 0.1229798 0.0093018 13.221 < 2e-16 ***
## Research 0.0251649 0.0065988 3.814 0.000154 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06007 on 494 degrees of freedom
## Multiple R-squared: 0.8207, Adjusted R-squared: 0.8188
## F-statistic: 452.1 on 5 and 494 DF, p-value: < 2.2e-16
# Diagnostic plots for the reduced Chance.of.Admit model.
plot(x = linear)
# Backward elimination for Research using lm(). Chance.of.Admit is not part of
# the candidate predictors. Only the final model's summary is printed.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP + LOR + CGPA,
  data = admissionsData
)
# summary(linear)
# Step 1: drop SOP.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + LOR + CGPA,
  data = admissionsData
)
# summary(linear)
# Step 2: drop CGPA as well.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + LOR,
  data = admissionsData
)
# summary(linear)
# Step 3: drop LOR as well.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating,
  data = admissionsData
)
# summary(linear)
# Step 4: drop TOEFL.Score as well.
linear <- lm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = admissionsData
)
# summary(linear)
# Step 5: drop Serial.No. as well. The leading unary plus is kept so the call
# printed by summary() stays the same.
linear <- lm(
  formula = Research ~ +GRE.Score + University.Rating,
  data = admissionsData
)
summary(object = linear)
##
## Call:
## lm(formula = Research ~ +GRE.Score + University.Rating, data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14033 -0.35017 0.00906 0.29255 1.00181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.415603 0.625451 -10.258 <2e-16 ***
## GRE.Score 0.021546 0.002099 10.266 <2e-16 ***
## University.Rating 0.050337 0.020731 2.428 0.0155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4089 on 497 degrees of freedom
## Multiple R-squared: 0.3254, Adjusted R-squared: 0.3227
## F-statistic: 119.9 on 2 and 497 DF, p-value: < 2.2e-16
# Diagnostic plots for the final Research model.
plot(x = linear)
# Backward elimination for University.Rating: starting model with every
# column except Chance.of.Admit as a predictor.
linear <- lm(
  formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score + SOP +
    LOR + CGPA + Research,
  data = admissionsData
)
summary(object = linear)
##
## Call:
## lm(formula = University.Rating ~ Serial.No. + GRE.Score + TOEFL.Score +
## SOP + LOR + CGPA + Research, data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.34352 -0.46556 -0.03557 0.44046 2.44809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.4170319 1.2125399 -5.292 1.82e-07 ***
## Serial.No. 0.0001812 0.0002260 0.802 0.42307
## GRE.Score 0.0065910 0.0059476 1.108 0.26833
## TOEFL.Score 0.0209679 0.0103520 2.025 0.04336 *
## SOP 0.4474027 0.0507642 8.813 < 2e-16 ***
## LOR 0.1498125 0.0488318 3.068 0.00227 **
## CGPA 0.3578395 0.1141124 3.136 0.00182 **
## Research 0.0923371 0.0783086 1.179 0.23891
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7109 on 492 degrees of freedom
## Multiple R-squared: 0.619, Adjusted R-squared: 0.6135
## F-statistic: 114.2 on 7 and 492 DF, p-value: < 2.2e-16
# Next step: refit without Serial.No.
linear <- lm(
  formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP + LOR + CGPA +
    Research,
  data = admissionsData
)
summary(object = linear)
##
## Call:
## lm(formula = University.Rating ~ GRE.Score + TOEFL.Score + SOP +
## LOR + CGPA + Research, data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.36251 -0.47140 -0.04223 0.45376 2.41297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.295220 1.202548 -5.235 2.45e-07 ***
## GRE.Score 0.006468 0.005943 1.088 0.27705
## TOEFL.Score 0.020128 0.010295 1.955 0.05114 .
## SOP 0.441757 0.050255 8.790 < 2e-16 ***
## LOR 0.154072 0.048524 3.175 0.00159 **
## CGPA 0.364222 0.113793 3.201 0.00146 **
## Research 0.096184 0.078133 1.231 0.21890
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7106 on 493 degrees of freedom
## Multiple R-squared: 0.6185, Adjusted R-squared: 0.6138
## F-statistic: 133.2 on 6 and 493 DF, p-value: < 2.2e-16
# Next step: refit without GRE.Score.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA + Research,
  data = admissionsData
)
summary(object = linear)
##
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA +
## Research, data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.37560 -0.47448 -0.03629 0.45065 2.41676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.243653 0.715856 -7.325 9.79e-13 ***
## TOEFL.Score 0.025353 0.009109 2.783 0.00559 **
## SOP 0.440906 0.050259 8.773 < 2e-16 ***
## LOR 0.151540 0.048478 3.126 0.00188 **
## CGPA 0.414718 0.103920 3.991 7.59e-05 ***
## Research 0.120784 0.074805 1.615 0.10702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7107 on 494 degrees of freedom
## Multiple R-squared: 0.6176, Adjusted R-squared: 0.6137
## F-statistic: 159.5 on 5 and 494 DF, p-value: < 2.2e-16
# Next step: refit without Research.
linear <- lm(
  formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
  data = admissionsData
)
summary(object = linear)
##
## Call:
## lm(formula = University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
## data = admissionsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46231 -0.46269 -0.04935 0.45262 2.39211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.62010 0.67792 -8.290 1.07e-15 ***
## TOEFL.Score 0.02695 0.00907 2.971 0.00311 **
## SOP 0.44423 0.05030 8.832 < 2e-16 ***
## LOR 0.15563 0.04849 3.210 0.00142 **
## CGPA 0.44360 0.10254 4.326 1.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7119 on 495 degrees of freedom
## Multiple R-squared: 0.6155, Adjusted R-squared: 0.6124
## F-statistic: 198.1 on 4 and 495 DF, p-value: < 2.2e-16
# Backward elimination for Research using glm(): start from all other columns.
# NOTE(review): no `family` is passed, so this is glm()'s default gaussian fit.
logmod <- glm(formula = Research ~ ., data = admissionsData)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ ., data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.01724 -0.33223 0.00753 0.29143 0.99776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.738e+00 7.931e-01 -5.974 4.44e-09 ***
## Serial.No. 7.426e-05 1.313e-04 0.565 0.572113
## GRE.Score 1.921e-02 3.327e-03 5.776 1.36e-08 ***
## TOEFL.Score -8.741e-03 5.980e-03 -1.462 0.144417
## University.Rating 2.412e-02 2.566e-02 0.940 0.347657
## SOP 1.441e-02 3.108e-02 0.464 0.643147
## LOR 1.404e-02 2.840e-02 0.494 0.621210
## CGPA -9.398e-02 7.457e-02 -1.260 0.208213
## Chance.of.Admit 1.065e+00 3.066e-01 3.474 0.000557 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1633439)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 80.202 on 491 degrees of freedom
## AIC: 523.91
##
## Number of Fisher Scoring iterations: 2
# Diagnostic plots for the full Research model.
plot(x = logmod)
# Refit without LOR. Chance.of.Admit is also absent from the explicit
# formulas from this step onward.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP + CGPA,
  data = admissionsData
)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
## University.Rating + SOP + CGPA, data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09472 -0.33171 0.01616 0.28395 1.02222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.3439527 0.6572538 -9.652 < 2e-16 ***
## Serial.No. 0.0001857 0.0001291 1.438 0.151
## GRE.Score 0.0216466 0.0032778 6.604 1.04e-10 ***
## TOEFL.Score -0.0054357 0.0059720 -0.910 0.363
## University.Rating 0.0344431 0.0256319 1.344 0.180
## SOP 0.0302826 0.0298521 1.014 0.311
## CGPA 0.0443213 0.0648898 0.683 0.495
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1670782)
##
## Null deviance: 123.20 on 499 degrees of freedom
## Residual deviance: 82.37 on 493 degrees of freedom
## AIC: 533.24
##
## Number of Fisher Scoring iterations: 2
# plot(logmod)
# Refit with CGPA dropped in addition to LOR.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
    University.Rating + SOP,
  data = admissionsData
)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + TOEFL.Score +
## University.Rating + SOP, data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09070 -0.33673 0.01374 0.28546 1.03378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.4208255 0.6471960 -9.921 < 2e-16 ***
## Serial.No. 0.0001936 0.0001285 1.506 0.133
## GRE.Score 0.0225907 0.0029705 7.605 1.45e-13 ***
## TOEFL.Score -0.0042408 0.0057070 -0.743 0.458
## University.Rating 0.0375039 0.0252235 1.487 0.138
## SOP 0.0358048 0.0287209 1.247 0.213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1668978)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 82.448 on 494 degrees of freedom
## AIC: 531.72
##
## Number of Fisher Scoring iterations: 2
# Refit with TOEFL.Score dropped in addition to LOR and CGPA.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating + SOP,
  data = admissionsData
)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating +
## SOP, data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.09592 -0.34393 0.00147 0.29124 1.03427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.3837643 0.6449795 -9.898 <2e-16 ***
## Serial.No. 0.0002017 0.0001280 1.575 0.116
## GRE.Score 0.0210992 0.0021887 9.640 <2e-16 ***
## University.Rating 0.0346801 0.0249243 1.391 0.165
## SOP 0.0319968 0.0282472 1.133 0.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1667468)
##
## Null deviance: 123.20 on 499 degrees of freedom
## Residual deviance: 82.54 on 495 degrees of freedom
## AIC: 530.27
##
## Number of Fisher Scoring iterations: 2
# Refit with SOP dropped in addition to LOR, CGPA and TOEFL.Score.
logmod <- glm(
  formula = Research ~ Serial.No. + GRE.Score + University.Rating,
  data = admissionsData
)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ Serial.No. + GRE.Score + University.Rating,
## data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.10835 -0.34957 0.00049 0.28952 1.02269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.5389338 0.6304444 -10.372 <2e-16 ***
## Serial.No. 0.0001855 0.0001272 1.458 0.1455
## GRE.Score 0.0217887 0.0021030 10.361 <2e-16 ***
## University.Rating 0.0504027 0.0207077 2.434 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.166842)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 82.754 on 496 degrees of freedom
## AIC: 529.57
##
## Number of Fisher Scoring iterations: 2
# Final step: Serial.No. dropped too, leaving GRE.Score and University.Rating.
logmod <- glm(
  formula = Research ~ GRE.Score + University.Rating,
  data = admissionsData
)
summary(object = logmod)
##
## Call:
## glm(formula = Research ~ GRE.Score + University.Rating, data = admissionsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.14033 -0.35017 0.00906 0.29255 1.00181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.415603 0.625451 -10.258 <2e-16 ***
## GRE.Score 0.021546 0.002099 10.266 <2e-16 ***
## University.Rating 0.050337 0.020731 2.428 0.0155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1672199)
##
## Null deviance: 123.200 on 499 degrees of freedom
## Residual deviance: 83.108 on 497 degrees of freedom
## AIC: 529.71
##
## Number of Fisher Scoring iterations: 2
#plot(logmod)
# Leave-one-out cross-validation (LOOCV) of the reduced Chance.of.Admit model.
# For every row i the model is fitted on all other rows, and the squared error
# of its prediction for row i is stored in msecv[i].
#
# The formula uses plain column names with `data = admissionsData[-i, ]` so
# that predict() can look the predictors up by name in the held-out row.
# Indexing inside the formula (e.g. GRE.Score[-i]) prevents that lookup, and
# the held-out row is then never scored.
#
# LOOCV itself draws no random numbers. The seed is kept so the RNG state
# seen by later code is unchanged.
set.seed(7861)
n_obs <- nrow(admissionsData)
cvlm <- vector("list", n_obs)
msecv <- numeric(n_obs)
for (i in seq_len(n_obs)) {
  # Fit the linear model on every row except i.
  cvlm[[i]] <- lm(
    Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
    data = admissionsData[-i, ]
  )
  # Squared prediction error on the single held-out row.
  held_out <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = held_out) -
    held_out$Chance.of.Admit)^2
}
# LOOCV estimate of the mean squared error.
mean(msecv)
# NOTE: the previously recorded value (0.07472015) came from the faulty
# prediction step and is not a LOOCV error; re-run to record the new value.
# Leave-one-out cross-validation (LOOCV) of Chance.of.Admit on CGPA alone.
# For every row i the model is fitted on all other rows, and the squared error
# of its prediction for row i is stored in msecv[i].
#
# The formula uses plain column names with `data = admissionsData[-i, ]` so
# that predict() can look CGPA up by name in the held-out row. Indexing inside
# the formula (CGPA[-i]) prevents that lookup.
#
# LOOCV itself draws no random numbers. The seed is kept so the RNG state
# seen by later code is unchanged.
set.seed(7861)
n_obs <- nrow(admissionsData)
cvlm <- vector("list", n_obs)
msecv <- numeric(n_obs)
for (i in seq_len(n_obs)) {
  # Fit the linear model on every row except i.
  cvlm[[i]] <- lm(Chance.of.Admit ~ CGPA, data = admissionsData[-i, ])
  # Squared prediction error on the single held-out row.
  held_out <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = held_out) -
    held_out$Chance.of.Admit)^2
}
# LOOCV estimate of the mean squared error.
mean(msecv)
# NOTE: the previously recorded value (0.06879746) came from the faulty
# prediction step and is not a LOOCV error; re-run to record the new value.
# In-sample classification of Research (0/1) with a logistic regression on all
# other columns, scored at a 0.5 probability cut-off.
ResearchData <- admissionsData$Research
ResearchDataFactor <- factor(admissionsData$Research)
simlog <- glm(factor(Research) ~ ., family = "binomial", data = admissionsData)
# Predicted class: 1 when the fitted probability exceeds 0.5, otherwise 0.
predictedProb <- predict(simlog, type = "response")
predictedResearch <- as.numeric(predictedProb > 0.5)
# Confusion table: rows = predicted (FALSE/TRUE), columns = observed Research.
table(predictedProb > 0.5, ResearchData)
## ResearchData
## 0 1
## FALSE 154 57
## TRUE 66 223
# Misclassification rate = share of ALL observations that are misclassified.
# It is computed from the data rather than typed in from the table; the old
# hard-coded version divided by the number of correct predictions (154 + 223).
misclassificationRate <- mean(predictedResearch != ResearchData)
capture.output(cat('Misclassification rate = ', misclassificationRate))
# Implied by the confusion table above: (57 + 66) / 500 = 0.246.
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
# Arguments are passed by name. F1_Score() and Sensitivity() take y_true
# first, whereas Accuracy() takes y_pred first. Research == 1 is set as the
# positive class explicitly; the default would be the first level ("0").
F1 <- F1_Score(
  y_true = ResearchData, y_pred = predictedResearch, positive = "1"
)
Accu <- Accuracy(y_pred = predictedResearch, y_true = ResearchData)
Sens <- Sensitivity(
  y_true = ResearchData, y_pred = predictedResearch, positive = "1"
)
scoreTable <- cbind(F1, Accu, Sens, misclassificationRate)
colnames(scoreTable) <- c("F1 Score", "Accuracy", "Sensitivity", "Misclassification")
rownames(scoreTable) <- c("Logistic Regression")
#rownames(scoreTable)<-c("Logistic Regression", "Neural Network")
round(scoreTable, 3)
# Implied by the confusion table above, with Research == 1 as positive:
#   F1 Score          = 2 * 223 / (2 * 223 + 66 + 57) = 0.784
#   Accuracy          = (154 + 223) / 500             = 0.754
#   Sensitivity       = 223 / (223 + 57)              = 0.796
#   Misclassification = (57 + 66) / 500               = 0.246
# The previously recorded row (0.715, 0.754, 0.73, 0.326) no longer applies.
# Leave-one-out cross-validation (LOOCV) of the reduced University.Rating
# model. For every row i the model is fitted on all other rows, and the squared
# error of its prediction for row i is stored in msecv[i].
#
# The formula uses plain column names with `data = admissionsData[-i, ]`. This
# lets predict() find the predictors by name in the held-out row, and it also
# removes the earlier `LOR[-1]` typo, which always dropped row 1 instead of
# row i.
#
# LOOCV itself draws no random numbers. The seed is kept so the RNG state
# seen by later code (the bootstrap resampling) is unchanged.
set.seed(7861)
n_obs <- nrow(admissionsData)
cvlm <- vector("list", n_obs)
msecv <- numeric(n_obs)
for (i in seq_len(n_obs)) {
  # Fit the linear model on every row except i.
  cvlm[[i]] <- lm(
    University.Rating ~ TOEFL.Score + SOP + LOR + CGPA,
    data = admissionsData[-i, ]
  )
  # Squared prediction error on the single held-out row.
  held_out <- admissionsData[i, , drop = FALSE]
  msecv[i] <- (predict(cvlm[[i]], newdata = held_out) -
    held_out$University.Rating)^2
}
# LOOCV estimate of the mean squared error.
mean(msecv)
# NOTE: the previously recorded value (3.373402) came from the faulty
# prediction step and is not a LOOCV error; re-run to record the new value.
# Non-parametric bootstrap of the reduced Chance.of.Admit regression.
# Each of the B iterations resamples the rows of admissionsData with
# replacement, refits the model, and stores its coefficients as one row of
# `bootcoef`.
B <- 10000
admit_formula <- Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA +
  Research
# The coefficient matrix is sized from the model that is actually bootstrapped,
# not from whatever happens to be stored in the global `linear`. This gives
# every coefficient (including Research) its own column.
admit_coef_names <- names(coef(lm(admit_formula, data = admissionsData)))
n_obs <- nrow(admissionsData)
bootcoef <- matrix(
  NA_real_,
  nrow = B,
  ncol = length(admit_coef_names),
  dimnames = list(NULL, admit_coef_names)
)
# Only the coefficients are kept. Holding on to all B resampled data frames
# and fitted models costs a lot of memory, and both lists are reset before the
# next bootstrap.
for (i in seq_len(B)) {
  resample_rows <- sample(seq_len(n_obs), n_obs, replace = TRUE)
  boot_fit <- lm(admit_formula, data = admissionsData[resample_rows, ])
  bootcoef[i, ] <- coef(boot_fit)
}
# Jackknife standard error of a statistic.
#
# vec: numeric vector of replicate estimates, one per left-out observation.
# Returns sqrt((n - 1) / n * sum((vec - mean(vec))^2)) with n = length(vec),
# or NA_real_ for an empty vector.
#
# n is the number of replicates in `vec`. length() of the data frame would be
# its number of columns, which is unrelated to the number of replicates.
jk <- function(vec) {
  n <- length(vec)
  if (n == 0L) {
    return(NA_real_)
  }
  sqrt((n - 1) / n * sum((vec - mean(vec))^2))
}
# Compare model-based and bootstrap standard errors for the Chance.of.Admit
# model. The reference fit is built here from the same formula the bootstrap
# loop refits, so the comparison does not depend on whichever model was last
# assigned to the global `linear`.
admit_fit <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = admissionsData
)
# Model-based standard errors of the coefficients.
summary(admit_fit)$coefficients[, 2]
# The output previously recorded here listed TOEFL.Score, SOP, LOR and CGPA,
# i.e. it came from a different model than the one bootstrapped. Re-run to
# record the standard errors of the model above.
# Bootstrap standard errors: standard deviation of each stored coefficient
# column across the bootstrap replicates.
apply(bootcoef, 2, sd)
# Recorded output from an earlier run (first five coefficients):
## [1] 0.1136140857 0.0005446949 0.0007221458 0.0035845184 0.0092885531
# Bootstrap estimate of each coefficient: mean across the replicates.
colMeans(bootcoef)
# Recorded output from an earlier run (first five coefficients):
## [1] -1.334316606 0.001880635 0.003020571 0.019340184 0.123082839
# Non-parametric bootstrap of the reduced Research model.
# Each of the B iterations resamples the rows of admissionsData with
# replacement, refits the glm, and stores its coefficients as one row of
# `bootcoef`.
# NOTE(review): glm() is called without `family`, so this bootstraps a
# gaussian (linear) fit of the 0/1 Research column.
research_formula <- Research ~ GRE.Score + University.Rating
# The coefficient matrix is sized from the model that is actually bootstrapped.
# Sizing it from the unrelated global `linear` left extra all-NA columns.
research_coef_names <- names(coef(glm(research_formula, data = admissionsData)))
n_obs <- nrow(admissionsData)
# NOTE(review): every resample and fitted model is still retained in these two
# lists, as before. That is memory-heavy for large B; drop them if nothing
# later reads them.
newboots <- vector("list", B)
bootsmod <- vector("list", B)
bootcoef <- matrix(
  NA_real_,
  nrow = B,
  ncol = length(research_coef_names),
  dimnames = list(NULL, research_coef_names)
)
for (i in seq_len(B)) {
  newboots[[i]] <- admissionsData[sample(seq_len(n_obs), n_obs, replace = TRUE), ]
  bootsmod[[i]] <- glm(research_formula, data = newboots[[i]])
  bootcoef[i, ] <- coef(bootsmod[[i]])
}
# Model-based standard errors of the coefficients of the single fit `logmod`.
# NOTE(review): `logmod` was fitted with glm()'s default gaussian family.
coef(summary(logmod))[, "Std. Error"]
## (Intercept) GRE.Score University.Rating
## 0.625450864 0.002098797 0.020731070
# Bootstrap standard errors: standard deviation of each of the first three
# coefficient columns across the bootstrap replicates.
vapply(1:3, function(k) sd(bootcoef[, k]), numeric(1))
## [1] 0.607448059 0.002057252 0.020905327
# Bootstrap coefficient estimates: mean of each of the same three columns.
vapply(1:3, function(k) mean(bootcoef[, k]), numeric(1))
## [1] -6.41459810 0.02154669 0.05008033